%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Load the California housing dataset and preview the first rows.
data = pd.read_csv('housing.csv')
data.head()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
# (rows, columns) of the raw dataset -- (20640, 10) per the output below.
data.shape
(20640, 10)
# Column dtypes and non-null counts; total_bedrooms is the only column with
# missing entries (20433 non-null of 20640).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20640 non-null float64 1 latitude 20640 non-null float64 2 housing_median_age 20640 non-null float64 3 total_rooms 20640 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20640 non-null float64 6 households 20640 non-null float64 7 median_income 20640 non-null float64 8 median_house_value 20640 non-null float64 9 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
# Per-column null counts -- 207 nulls, all in total_bedrooms.
data.isnull().sum()
longitude 0 latitude 0 housing_median_age 0 total_rooms 0 total_bedrooms 207 population 0 households 0 median_income 0 median_house_value 0 ocean_proximity 0 dtype: int64
# Impute the 207 missing total_bedrooms values with the column's mode.
# NOTE(review): median is the more usual choice for a skewed count variable
# like this -- confirm that mode imputation was intentional.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].mode()[0])
# For every numeric column, draw a histogram (with KDE), a box plot, and a
# bar chart of its describe() summary statistics side by side.
numeric_columns = data.select_dtypes(exclude='object')
for col in numeric_columns:
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    sns.histplot(data=data, x=col, bins=30, kde=True, ax=axs[0])
    sns.boxplot(x=data[col], ax=axs[1])
    summary = data[col].describe().reset_index()
    sns.barplot(data=summary, x='index', y=col, ax=axs[2])
def find_boundry(df, variable):
    """Return the (lower, upper) IQR fences for *variable* in *df*.

    Uses the usual Tukey rule: Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    """
    q1 = df[variable].quantile(.25)
    q3 = df[variable].quantile(.75)
    spread = 1.5 * (q3 - q1)
    return q1 - spread, q3 + spread

def treat_outliers(df, variable):
    """Clip *variable* in *df* (in place) to its IQR fences.

    Returns df.head() so a notebook cell shows a preview of the result.
    """
    lower, upper = find_boundry(df, variable)
    df[variable] = df[variable].clip(lower=lower, upper=upper)
    return df.head()
# Cap outliers in each skewed numeric feature using the IQR rule.
for skewed_col in ['total_rooms', 'total_bedrooms', 'population',
                   'households', 'median_income', 'median_house_value']:
    treat_outliers(data, skewed_col)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.000 | 129.0 | 322.0 | 126.0 | 8.013025 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 5698.375 | 1106.0 | 2401.0 | 1092.5 | 8.013025 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.000 | 190.0 | 496.0 | 177.0 | 7.257400 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.000 | 235.0 | 558.0 | 219.0 | 5.643100 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.000 | 280.0 | 565.0 | 259.0 | 3.846200 | 342200.0 | NEAR BAY |
# Re-draw the distribution / box / summary plots to verify the capping worked.
for feature in numeric_columns:
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
    sns.histplot(data=data, x=feature, bins=30, kde=True, ax=axes[0])
    sns.boxplot(x=data[feature], ax=axes[1])
    stats_table = data[feature].describe().reset_index()
    sns.barplot(data=stats_table, x='index', y=feature, ax=axes[2])
# Pairwise scatter/marginal plots of all features.
# Fix: sns.pairplot builds its own figure, so the previous
# plt.figure(figsize=(20, 20)) only emitted an empty
# "<Figure size 2000x2000 with 0 Axes>" artifact (visible in the cell
# output). Size the grid through pairplot's own `height` parameter instead.
sns.pairplot(data, height=2)
plt.show()
<Figure size 2000x2000 with 0 Axes>
# Correlation heatmap of the numeric features. Restrict to numeric columns
# explicitly: ocean_proximity is still a string column at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older pandas
# silently dropped it).
plt.figure(figsize=(8, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
<AxesSubplot:>
# Preview the data after imputation and outlier capping.
data.head()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.000 | 129.0 | 322.0 | 126.0 | 8.013025 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 5698.375 | 1106.0 | 2401.0 | 1092.5 | 8.013025 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.000 | 190.0 | 496.0 | 177.0 | 7.257400 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.000 | 235.0 | 558.0 | 219.0 | 5.643100 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.000 | 280.0 | 565.0 | 259.0 | 3.846200 | 342200.0 | NEAR BAY |
# Encode categorical columns as integer codes with LabelEncoder
# (only ocean_proximity is non-numeric at this point).
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = label_encoder.fit_transform(data[column])
# Derived ratio features.
# NOTE(review): despite its name, the 'population/households' column is
# computed as households / population, and 'avg_bedrooms' is bedrooms per
# room -- confirm the column names are intended before relying on them.
data['room_per_household'] = data['total_rooms'] / data['households']
data['avg_bedrooms'] = data['total_bedrooms'] / data['total_rooms']
data['population/households'] = data['households'] / data['population']
data['total_area_of_rooms'] = data['total_rooms'] + data['total_bedrooms']
# Shape after feature engineering -- four new columns, now (20640, 14).
data.shape
(20640, 14)
# Correlation heatmap including the engineered features (every column is
# numeric after label encoding, so corr() covers them all).
plt.figure(figsize=(16, 10))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")
<AxesSubplot:>
from sklearn.model_selection import train_test_split

# 80/20 train-test split; median_house_value is the regression target.
features = data.drop('median_house_value', axis=1)
target = data['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((16512, 13), (4128, 13), (16512,), (4128,))
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
def regressor_rmse(X_train, y_train, X_test, y_test):
    """Fit several baseline regressors and print each one's test RMSE."""
    baselines = [LinearRegression, SVR, Lasso, Ridge, KNeighborsRegressor]
    for model in baselines:
        estimator = model()
        estimator.fit(X_train, y_train)
        preds = estimator.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print('RMSE for {} is {}'.format(model.__name__, rmse))
#regressor_rmse(X_train, y_train, X_test, y_test)
from sklearn.model_selection import GridSearchCV
def hyperparameter_tuning(model, train_data, target_data, param_grid):
    """Exhaustively search *param_grid* for *model* with 5-fold CV.

    Returns a dict with the best hyperparameters ('best_params') and the
    refit best estimator ('best_model').
    """
    searcher = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
    searcher.fit(train_data, target_data)
    return {'best_params': searcher.best_params_,
            'best_model': searcher.best_estimator_}
# Grid-search an LGBM regressor over a small hyperparameter grid.
# NOTE(review): the search runs on the full dataset (features/target), so the
# later hold-out evaluation is optimistic -- the test rows were already seen
# during cross-validation. Consider searching on X_train/y_train only.
lgbm_model = LGBMRegressor(random_state=101)
param_grid = {
    "n_estimators": [10, 100, 250, 300],
    "min_child_samples": [20, 70, 100],
    "num_leaves": [15, 50],
    "max_depth": [5, 15],
}
hyperparameter_tuning(lgbm_model, features, target, param_grid)
{'best_params': {'max_depth': 15,
'min_child_samples': 70,
'n_estimators': 250,
'num_leaves': 15},
'best_model': LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
num_leaves=15, random_state=101)}
# Refit the tuned configuration on the training split only.
# NOTE(review): these hyperparameters are hard-coded from the grid-search
# output above; reusing the 'best_model' returned by hyperparameter_tuning
# would avoid the duplication.
best_lgbm_model = LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
num_leaves=15, random_state=101)
best_lgbm_model.fit(X_train,y_train)
LGBMRegressor(max_depth=15, min_child_samples=70, n_estimators=250,
num_leaves=15, random_state=101)
# Test-set RMSE for the tuned LGBM model (~44315 in the recorded run).
test_pred= best_lgbm_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, test_pred))
44315.336599218215
# Side-by-side comparison of actual vs. predicted house values.
pd.DataFrame({'actual':y_test, 'predicted': test_pred})
| actual | predicted | |
|---|---|---|
| 14740 | 136900.0 | 140941.098879 |
| 10101 | 241300.0 | 261363.085993 |
| 20566 | 200700.0 | 147172.674029 |
| 2670 | 72500.0 | 66280.827409 |
| 15709 | 460000.0 | 465845.150060 |
| ... | ... | ... |
| 6655 | 169500.0 | 203877.415146 |
| 3505 | 204600.0 | 198540.108607 |
| 1919 | 128600.0 | 133469.431245 |
| 1450 | 259500.0 | 231089.680272 |
| 4148 | 167600.0 | 193282.365049 |
4128 rows × 2 columns
#dt_model = DecisionTreeRegressor()
#param_grid = {
# "max_depth": [5, 15],
# "min_samples_leaf": [2, 25],
# "max_features": [0.1, 0.9]
# }
#hyperparameter_tuning(dt_model,features,target,param_grid)
#model = DecisionTreeRegressor(max_depth=15, max_features=0.9, min_samples_leaf=25,
# random_state=101)
#model.fit(X_train,y_train)
#test_pred = model.predict(X_test)
#model.feature_importances_
#model.feature_names_in_
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
#np.sqrt(mean_squared_error(y_test, test_pred))
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})
from sklearn.model_selection import RandomizedSearchCV
def randomSearchCV(estimator, X_train, y_train, param_distributions, n_iter, cv=5, scoring=None, n_jobs=-1, random_state=42):
    """Run a randomized hyperparameter search with cross-validation.

    Samples *n_iter* candidates from *param_distributions*, scores each with
    *cv*-fold CV, and returns a dict with the best hyperparameters
    ('best_params') and the refit best estimator ('best_model').
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    return {'best_params': search.best_params_,
            'best_model': search.best_estimator_}
#randomforest_model = RandomForestRegressor()
#param_distributions ={
# "n_estimators": (10,100, 250),
# "max_depth": (5, 15),
# "min_samples_leaf": (2, 25),
# "max_features": (0.1, 0.9),
# }
#randomSearchCV(randomforest_model,features,target, param_distributions, n_iter=10, cv=5, scoring=None, n_jobs=-1,
# random_state=101)
#model =RandomForestRegressor(max_depth=15, max_features=0.9, min_samples_leaf=2,
# n_estimators=250, random_state=101)
#model.fit(X_train,y_train)
#test_pred= model.predict(X_test)
#np.sqrt(mean_squared_error(y_test, test_pred))
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})
#!pip install bayesian-optimization
#from sklearn.model_selection import cross_val_score
#from bayes_opt import BayesianOptimization
#def rfc_cv(n_estimators, max_depth, min_samples_leaf, max_features, data, targets):
# estimator = RandomForestRegressor(
# n_estimators=n_estimators,
# max_depth=max_depth,
# min_samples_leaf=min_samples_leaf,
# max_features=max_features,
# random_state=121
# )
# cval = cross_val_score(estimator, data, targets,
# scoring=None, cv=3)
# return cval.mean()
#def rfc_crossval(n_estimators, max_depth, min_samples_leaf, max_features):
# return rfc_cv(
# n_estimators=int(n_estimators),
# max_depth=int(max_depth),
# min_samples_leaf=int(min_samples_leaf),
# max_features=max(min(max_features, 0.999), 1e-3),
# data=X_train,
# targets=y_train,
# )
#optimizer = BayesianOptimization(
# f=rfc_crossval,
# pbounds={
# "n_estimators": (10, 250),
# "max_depth": (5,15),
# "min_samples_leaf": (2, 25),
# "max_features": (0.1, 0.999),
# },
# random_state=111,
# verbose=2
# )
#optimizer.maximize(n_iter=10)
#best_params = optimizer.max['params']
#best_params['max_depth'] = int(best_params['max_depth'])
#best_params['max_features'] = best_params['max_features']
#best_params['min_samples_leaf'] = int(best_params['min_samples_leaf'])
#best_params['n_estimators'] = int(best_params['n_estimators'])
#best_model = RandomForestRegressor(random_state=121)
#best_model.set_params(**best_params)
#best_model.fit(X_train, y_train)
#model_1 =RandomForestRegressor(max_depth=13, max_features=0.4667195539991278,
# min_samples_leaf=14, n_estimators=226, random_state=121)
#model_1.fit(X_train,y_train)
#test_pred= model_1.predict(X_test)
#np.sqrt(mean_squared_error(y_test, test_pred))
#pd.DataFrame({"features":model.feature_names_in_ ,"importance":model.feature_importances_}).sort_values("importance",ascending=False)
#pd.DataFrame({'actual':y_test, 'predicted': test_pred})
# One-row quality summary of the dataset: column list, dimensions,
# duplicate count, null count, and which columns contain nulls.
datasets = [data]
data_summary = pd.DataFrame()
# Column names come straight from df.columns (the original iterated
# isnull().sum().items() just to recover the names -- same result, clearer).
data_summary['columns'] = [', '.join(df.columns) for df in datasets]
data_summary['total_rows'] = [df.shape[0] for df in datasets]
data_summary['total_cols'] = [df.shape[1] for df in datasets]
# duplicated().sum() counts duplicate rows without materializing them.
data_summary['total_duplicate'] = [int(df.duplicated().sum()) for df in datasets]
data_summary['total_null'] = [int(df.isnull().sum().sum()) for df in datasets]
data_summary['null_cols'] = [
    ', '.join(col for col, nulls in df.isnull().sum().items() if nulls > 0)
    for df in datasets
]
data_summary.style.background_gradient(cmap='YlGnBu')
| columns | total_rows | total_cols | total_duplicate | total_null | null_cols | |
|---|---|---|---|---|---|---|
| 0 | longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, ocean_proximity | 20640 | 10 | 0 | 207 | total_bedrooms |